{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39329df3-1f99-4b11-9405-5969d52368a7",
   "metadata": {},
   "source": [
    "# Deciscion Tree & Hyperopt Example"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7d5f0ab-33cd-40f2-82e7-fb2747f04f89",
   "metadata": {},
   "source": [
    "Example showing how to use the Hyperopt library (http://hyperopt.github.io) for Bayesian hyperparameter optimization (via tree of parzen estimator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scikit-learn: 1.0\n",
      "hyperopt    : 0.2.5\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark -p scikit-learn,hyperopt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
   "metadata": {},
   "source": [
    "## Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train/Valid/Test sizes: 398 80 171\n"
     ]
    }
   ],
   "source": [
    "from sklearn import model_selection\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import datasets\n",
    "\n",
    "\n",
    "data = datasets.load_breast_cancer()\n",
    "X, y = data.data, data.target\n",
    "\n",
    "X_train, X_test, y_train, y_test = \\\n",
    "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
    "\n",
    "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
    "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
    "\n",
    "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0affc454-9f07-48e6-bcee-e6253d968247",
   "metadata": {},
   "source": [
    "## Hyperopt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "53282fd6-1292-4b4d-a0b7-980707d61c3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from hyperopt import Trials, STATUS_OK, tpe, hp, fmin\n",
    "import hyperopt.pyll.stochastic"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5435889f-3cd7-45cd-abb2-632e3b034194",
   "metadata": {},
   "source": [
    "Some random sampling examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7ca6f8f6-0c78-434a-8121-a83b5708e143",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.8925662130833578"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hyperopt.pyll.stochastic.sample(hp.loguniform('test', 1e-5, 1)) # range e^{low} to e^{high}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b2adc867-2d5a-44bd-8115-195ed53d6a7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hyperopt.pyll.stochastic.sample(hp.qloguniform('test', 1e-5, 1, 0.1)) # rounded to 0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9a6bb270-d2a1-4179-a770-39bad5a8332c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "\n",
    "params =  {\n",
    "    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 10)),\n",
    "    'min_impurity_decrease': hp.quniform('min_impurity_decrease', 0.0, 0.5, 0.05),\n",
    "    'max_depth': hp.choice('max_depth', [6, 16, None])\n",
    "}\n",
    "\n",
    "\n",
    "\n",
    "def optimization_objective(params):\n",
    "\n",
    "\n",
    "    tree = DecisionTreeClassifier(random_state=123, **params)\n",
    "    tree.fit(X_train, y_train)\n",
    "    \n",
    "    accuracies = cross_val_score(\n",
    "        estimator=tree, X=X_train, y=y_train, cv=10, n_jobs=-1)\n",
    "\n",
    "    score = accuracies.mean()\n",
    "\n",
    "    return {'loss':1-score, 'status': STATUS_OK}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a51829c6-234f-401f-84ed-a005f71d0150",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100%|████████| 50/50 [00:01<00:00, 32.09trial/s, best loss: 0.06756410256410261]\n"
     ]
    }
   ],
   "source": [
    "trials = Trials()\n",
    "best = fmin(fn=optimization_objective,\n",
    "            space=params,\n",
    "            algo=tpe.suggest,\n",
    "            max_evals=50,\n",
    "            trials=trials)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'max_depth': 2, 'min_impurity_decrease': 0.0, 'min_samples_split': 5}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42380f27-d982-4ae8-8981-17b7224ebb04",
   "metadata": {},
   "source": [
    "- Attention, `fmin` returns results from `hp.choice` as an index!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "83e99f85-9ce2-494e-99ea-20ab49dc0b15",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 7}\n"
     ]
    }
   ],
   "source": [
    "from hyperopt import space_eval\n",
    "\n",
    "best_params = space_eval(params, best)\n",
    "print(best_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fbb610d8-4846-4e9f-a589-adacd0042603",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(min_samples_split=7, random_state=123)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tree = DecisionTreeClassifier(random_state=123, **best_params)\n",
    "tree.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "763e816b-6437-45a9-812f-8b429472d75e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Accuracy: 0.99\n",
      "Test Accuracy: 0.94\n"
     ]
    }
   ],
   "source": [
    "print(f\"Training Accuracy: {tree.score(X_train, y_train):0.2f}\")\n",
    "print(f\"Test Accuracy: {tree.score(X_test, y_test):0.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
